#include <math.h>
#include <float.h>

#include "mips.h"
#include "intvfpu.h"
#include "vfpuutil.h"
#include "../log.h"
#include "int.h"

/*
 * Math constants. Some <math.h> implementations already provide these, so each
 * one is only defined when missing to avoid macro redefinition diagnostics.
 */
#ifndef M_E
#define M_E        2.7182818284590452354   /* e */
#endif
#ifndef M_LOG2E
#define M_LOG2E    1.4426950408889634074   /* log_2 e */
#endif
#ifndef M_LOG10E
#define M_LOG10E   0.43429448190325182765  /* log_10 e */
#endif
#ifndef M_LN2
#define M_LN2      0.69314718055994530942  /* log_e 2 */
#endif
#ifndef M_LN10
#define M_LN10     2.30258509299404568402  /* log_e 10 */
#endif
#ifndef M_PI
#define M_PI       3.14159265358979323846 /* pi */
#endif
#ifndef M_PI_2
#define M_PI_2     1.57079632679489661923 /* pi/2 */
#endif

/* VFPU register i viewed as a float, and as its raw 32-bit pattern. */
#define V(i)  (cpu.v[i])
#define VI(i) (*(u32*)&cpu.v[i])

/*
 * Pack / unpack a three-field value: b in bits 0-1, a in bits 2-4, c in bits 5-6.
 * Arguments and the whole expansion are parenthesized so the macros can be
 * used inside larger expressions.
 */
#define S_not(a, b, c) (((a) << 2) | (b) | ((c) << 5))
#define SgetA(v) (((v) >> 2) & 0x7)
#define SgetB(v) ((v) & 3)
#define SgetC(v) (((v) >> 5) & 0x3)
#define VS(m, row, col) V((m) * 4 + (row) + (col) * 32)

/* Effective address of a VFPU load/store: base register plus the signed,
 * word-aligned 16-bit offset; AVADDR additionally forces 16-byte alignment. */
#define VADDR   (R(_RS) + (s16)(op & 0xfffc))
#define AVADDR ((R(_RS) + (s16)(op & 0xfffc)) & ~0xf)
/* Register field of a VFPU load/store: op bit 0 supplies bit 5, op bits 16-20 the low bits. */
#define VT (((op & 1) << 5) | ((op >> 16) & 0x1f))

/* Selected bit of the VFPU condition-code register. Fully parenthesized so
 * that `!VFPUBRANCHVAL` negates the extracted bit, not the shifted word. */
#define VFPUBRANCHVAL ((cpu.vfpuCtrl[VFPU_CTRL_CC] >> _CC) & 1)

#define DEBUGME() _log(WRN, VFPU, "Called VFPU function %s", __FUNCTION__)

/*
 * Apply a source-operand prefix word to the first GetNumElements(size) lanes
 * of v, in place.
 *
 * Fields of `data` for lane i, as decoded below:
 *   bits 2i..2i+1 : lane index to read from (or low bits of the constant index)
 *   bit  8 + i    : take the absolute value (or, for constants, select the
 *                   upper half of the constant table)
 *   bit  12 + i   : replace the lane with a constant
 *   bit  16 + i   : negate the result
 */
void ApplyPrefixST(float *v, s32 data, DataSize size)
{
    static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f / 3.f, 0.25f, 1.f / 6.f};
    /* Zero-filled: a swizzle may select a lane >= n, which previously read an
     * indeterminate value. */
    float origV[4] = {0};
    s32 n = GetNumElements(size);
    s32 i;

    /* Snapshot the input so lanes can be permuted without reading overwritten values. */
    for (i = 0; i < n; i++)
        origV[i] = v[i];

    for (i = 0; i < n; i++)
    {
        s32 regnum = (data >> (i * 2)) & 3;
        s32 abs    = (data >> (8 + i)) & 1;
        s32 negate = (data >> (16 + i)) & 1;
        s32 constants = (data >> (12 + i)) & 1;

        if (!constants)
        {
            v[i] = origV[regnum];
            if (abs)
                v[i] = fabs(v[i]);
        }
        else
            v[i] = constantArray[regnum + (abs << 2)];

        if (negate)
            v[i] = -v[i];
    }
}

/* Apply the currently latched S prefix (VFPU_CTRL_PFXS) to v in place. */
void ApplySwizzleS(float *v, DataSize size)
{
    s32 prefix = cpu.vfpuCtrl[VFPU_CTRL_PFXS];

    ApplyPrefixST(v, prefix, size);
}

/* Apply the currently latched T prefix (VFPU_CTRL_PFXT) to v in place. */
void ApplySwizzleT(float *v, DataSize size)
{
    s32 prefix = cpu.vfpuCtrl[VFPU_CTRL_PFXT];

    ApplyPrefixST(v, prefix, size);
}

/*
 * Apply the destination prefix (VFPU_CTRL_PFXD) to the first
 * GetNumElements(size) lanes of v, in place.
 *
 * Each lane has a 2-bit saturation mode at bits 2i..2i+1:
 *   1 -> clamp to [0, 1]
 *   3 -> clamp to [-1, 1]
 *   anything else leaves the lane untouched.
 * Bit 8 + i of the prefix is not consumed here.
 */
void ApplyPrefixD(float *v, DataSize size)
{
    s32 prefix = cpu.vfpuCtrl[VFPU_CTRL_PFXD];
    s32 count = GetNumElements(size);
    s32 lane;

    for (lane = 0; lane < count; ++lane)
    {
        s32 mode = (prefix >> (lane * 2)) & 3;
        float lowest;

        if (mode != 1 && mode != 3)
            continue;

        /* Both modes share the upper bound; only the lower bound differs. */
        lowest = (mode == 1) ? 0.0f : -1.0f;

        if (v[lane] > 1.0f)
            v[lane] = 1.0f;
        if (v[lane] < lowest)
            v[lane] = lowest;
    }
}

/*
 * Reset all prefix state: S and T go back to the identity swizzle 0xe4,
 * D is cleared, and an all-zero write mask is installed.
 */
void EatPrefixes()
{
    s32 clearedMask[4] = {0, 0, 0, 0};

    cpu.vfpuCtrl[VFPU_CTRL_PFXS] = 0xe4; /* passthru */
    cpu.vfpuCtrl[VFPU_CTRL_PFXT] = 0xe4; /* passthru */
    cpu.vfpuCtrl[VFPU_CTRL_PFXD] = 0;

    SetWriteMask(clearedMask);
}

/*
 * vpfxs: latch the S-operand prefix from the opcode immediate.
 *
 * ApplyPrefixST() decodes swizzle (bits 0-7), abs (8-11), constant (12-15)
 * and negate (16-19) fields, so 20 bits have to be kept. The previous 16-bit
 * mask silently dropped every negate flag.
 */
void int_vpfxs(u32 op)
{
    DEBUGME();
    cpu.vfpuCtrl[VFPU_CTRL_PFXS] = op & 0xfffff;
}

/*
 * vpfxt: latch the T-operand prefix from the opcode immediate.
 *
 * ApplyPrefixST() decodes swizzle (bits 0-7), abs (8-11), constant (12-15)
 * and negate (16-19) fields, so 20 bits have to be kept. The previous 16-bit
 * mask silently dropped every negate flag.
 */
void int_vpfxt(u32 op)
{
    DEBUGME();
    cpu.vfpuCtrl[VFPU_CTRL_PFXT] = op & 0xfffff;
}

/*
 * vpfxd: latch the destination prefix from the low 8 bits of the opcode.
 * NOTE(review): bits above 7 of the immediate are discarded here -- confirm
 * whether any of them (e.g. a per-lane write mask) should be captured too.
 */
void int_vpfxd(u32 op)
{
    u32 prefix = op & 0xff;

    DEBUGME();
    cpu.vfpuCtrl[VFPU_CTRL_PFXD] = prefix;
}

/*
 * lv.s: load one float from memory into a single VFPU register.
 *
 * The single-register form addresses individual registers, so its register
 * field is 7 bits wide: op bits 16-20 are the low bits and op bits 0-1 the
 * high bits (the offset is word aligned, which frees those two bits). The
 * shared VT macro only keeps op bit 0 and therefore cannot reach the upper
 * half of the register file.
 */
void int_lvs(u32 op)
{
    u32 vt = ((op & 3) << 5) | ((op >> 16) & 0x1f);
    float v = mem_readFloat(VADDR);

    DEBUGME();
    WriteVector(&v, V_Single, vt);
}

/* lv.q: load four consecutive floats from the 16-byte-aligned address into a quad register. */
void int_lvq(u32 op)
{
    float quad[4];
    u8 lane;

    DEBUGME();
    /* Reads start at the effective address rounded down to 16 bytes. */
    for (lane = 0; lane < 4; ++lane)
        quad[lane] = mem_readFloat(AVADDR + lane * 4);

    WriteVector(quad, V_Quad, VT);
}

/* ulv.q: load four consecutive floats from the (unaligned) effective address into a quad register. */
void int_ulvq(u32 op)
{
    float quad[4];
    u8 lane;

    DEBUGME();
    for (lane = 0; lane < 4; ++lane)
        quad[lane] = mem_readFloat(VADDR + lane * 4);

    WriteVector(quad, V_Quad, VT);
}

/*
 * sv.s: store one float from a single VFPU register to memory.
 *
 * The single-register form addresses individual registers, so its register
 * field is 7 bits wide: op bits 16-20 are the low bits and op bits 0-1 the
 * high bits (the offset is word aligned, which frees those two bits). The
 * shared VT macro only keeps op bit 0 and therefore cannot reach the upper
 * half of the register file.
 */
void int_svs(u32 op)
{
    u32 vt = ((op & 3) << 5) | ((op >> 16) & 0x1f);
    float v;

    DEBUGME();
    ReadVector(&v, V_Single, vt);
    mem_writeFloat(VADDR, v);
}

/* sv.q: store a quad register as four consecutive floats at the 16-byte-aligned address. */
void int_svq(u32 op)
{
    float quad[4];
    u8 lane;

    DEBUGME();
    ReadVector(quad, V_Quad, VT);

    /* Writes start at the effective address rounded down to 16 bytes. */
    for (lane = 0; lane < 4; ++lane)
        mem_writeFloat(AVADDR + lane * 4, quad[lane]);
}

/* usv.q: store a quad register as four consecutive floats at the (unaligned) effective address. */
void int_usvq(u32 op)
{
    float quad[4];
    u8 lane;

    DEBUGME();
    ReadVector(quad, V_Quad, VT);

    for (lane = 0; lane < 4; ++lane)
        mem_writeFloat(VADDR + lane * 4, quad[lane]);
}

/* vmidt: write an identity matrix of the opcode's matrix size to VD, then reset the prefixes. */
void int_vmidt(u32 op)
{
    float identity[16] = {0};
    DataSize mtxSize = GetMtxSize(op);
    u8 k;

    DEBUGME();

    /* The buffer is laid out 4 floats per line, so the diagonal sits at 0, 5, 10, 15. */
    for (k = 0; k < 4; ++k)
        identity[k * 5] = 1;

    WriteMatrix(identity, mtxSize, 4, _VD);
    EatPrefixes();
}

/* vmzero: write an all-zero matrix of the opcode's matrix size to VD, then reset the prefixes. */
void int_vmzero(u32 op)
{
    float zeros[16] = {0};
    DataSize mtxSize = GetMtxSize(op);

    DEBUGME();

    WriteMatrix(zeros, mtxSize, 4, _VD);
    EatPrefixes();
}

/* vmone: write an all-ones matrix of the opcode's matrix size to VD, then reset the prefixes. */
void int_vmone(u32 op)
{
    float ones[16];
    DataSize mtxSize = GetMtxSize(op);
    u8 k;

    DEBUGME();

    for (k = 0; k < 16; ++k)
        ones[k] = 1;

    WriteMatrix(ones, mtxSize, 4, _VD);
    EatPrefixes();
}

/*
 * vzero: write a zero vector (after the D prefix) to VD.
 * Like every other prefix-consuming instruction in this file, the prefixes
 * are reset afterwards; previously they leaked into the next instruction.
 */
void int_vzero(u32 op)
{
    DataSize sz = GetVecSize(op);
    float o[4] = {0};
    DEBUGME();
    ApplyPrefixD(o, sz);
    WriteVector(o, sz, _VD);
    EatPrefixes();
}

/*
 * vone: write an all-ones vector (after the D prefix) to VD.
 * Like every other prefix-consuming instruction in this file, the prefixes
 * are reset afterwards; previously they leaked into the next instruction.
 */
void int_vone(u32 op)
{
    DataSize sz = GetVecSize(op);
    float o[4] = {1, 1, 1, 1};
    DEBUGME();
    ApplyPrefixD(o, sz);
    WriteVector(o, sz, _VD);
    EatPrefixes();
}

/* vfim: convert the 16-bit half-float immediate to a float and store it in register VT. */
void int_vfim(u32 op)
{
    float value;

    DEBUGME();
    value = Float16ToFloat32(op & 0xffff);
    V(_VT) = value;
    EatPrefixes();
}

/*
 * viim: store the 16-bit integer immediate, converted to float, in register VT.
 * The immediate is signed, so it is sign-extended through s16 before the
 * conversion; previously 0xffff became 65535.0 instead of -1.0.
 */
void int_viim(u32 op)
{
    DEBUGME();
    V(_VT) = (float)(s16)(op & 0xffff);
    EatPrefixes();
}

/*
 * vidt: write one row of an identity matrix to VD -- 1.0 in lane (VD & 3),
 * 0.0 in every other lane.
 *
 * The lane count now comes from GetNumElements(sz), as in the rest of this
 * file, instead of using the raw DataSize value as a loop bound.
 */
void int_vidt(u32 op)
{
    DataSize sz = GetVecSize(op);
    float f[4];
    u8 id = _VD & 3;
    s32 n;
    u8 i;
    DEBUGME();

    n = GetNumElements(sz);
    for (i = 0; i < n; i++)
        f[i] = (id == i) ? 1.0f : 0.0f;

    WriteVector(f, sz, _VD);
    EatPrefixes();
}

/*
 * vmmul: multiply the matrices in VS and VT and write the product to VD.
 * All three buffers hold 4 floats per line; only the leading side x side
 * block is computed, the rest of the result stays zero.
 */
void int_vmmul(u32 op)
{
    float lhs[16];
    float rhs[16];
    float product[16] = {0};

    DataSize mtxSize = GetMtxSize(op);
    u8 side = GetMatrixSide(mtxSize);
    u8 outer, inner, k;
    DEBUGME();

    ReadMatrix(lhs, mtxSize, 4, _VS);
    ReadMatrix(rhs, mtxSize, 4, _VT);

    for (outer = 0; outer < side; ++outer)
    {
        for (inner = 0; inner < side; ++inner)
        {
            /* Accumulate directly in the output cell, which starts at zero. */
            float *cell = &product[outer * 4 + inner];

            for (k = 0; k < side; ++k)
                *cell += lhs[inner * 4 + k] * rhs[outer * 4 + k];
        }
    }

    WriteMatrix(product, mtxSize, 4, _VD);
    EatPrefixes();
}

/* vmmov: copy the matrix in VS to VD, then reset the prefixes. */
void int_vmmov(u32 op)
{
    float buffer[16];
    DataSize mtxSize = GetMtxSize(op);

    DEBUGME();

    ReadMatrix(buffer, mtxSize, 4, _VS);
    WriteMatrix(buffer, mtxSize, 4, _VD);

    EatPrefixes();
}

/* vmov: VD = VS per lane, with the S prefix applied on input and the D prefix on output. */
void int_vmov(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = src[lane];

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vabs: VD = |VS| per lane, with the S prefix applied on input and the D prefix on output. */
void int_vabs(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = fabs(src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vneg: VD = -VS per lane, with the S prefix applied on input and the D prefix on output. */
void int_vneg(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = -src[lane];

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vsat0: clamp every lane of VS to [0, 1] and write the result to VD. */
void int_vsat0(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
    {
        float x = src[lane];

        /* Default to pass-through; values failing both comparisons are kept as they are. */
        dst[lane] = x;
        if (x < 0)
            dst[lane] = 0;
        else if (x > 1)
            dst[lane] = 1;
    }

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vsat1: clamp every lane of VS to [-1, 1] and write the result to VD. */
void int_vsat1(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
    {
        float x = src[lane];

        /* Default to pass-through; values failing both comparisons are kept as they are. */
        dst[lane] = x;
        if (x < -1)
            dst[lane] = -1;
        else if (x > 1)
            dst[lane] = 1;
    }

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vrcp: VD = 1 / VS per lane. */
void int_vrcp(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = 1.0f / src[lane];

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vrsq: VD = 1 / sqrt(VS) per lane. */
void int_vrsq(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = 1.0f / sqrt(src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vsin: VD = sin(VS * pi/2) per lane, i.e. the input is in quarter-turn units. */
void int_vsin(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = sin(M_PI_2 * src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vcos: VD = cos(VS * pi/2) per lane, i.e. the input is in quarter-turn units. */
void int_vcos(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = cos(M_PI_2 * src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vexp2: VD = 2 ^ VS per lane. */
void int_vexp2(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = pow(2.0f, src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vlog2: VD = log2(VS) per lane, computed as ln(x) / ln(2). */
void int_vlog2(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = log(src[lane]) / log(2.0f);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vsqrt: VD = sqrt(VS) per lane. */
void int_vsqrt(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = sqrt(src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vasin: VD = asin(VS) * 2/pi per lane, i.e. the result is in quarter-turn units. */
void int_vasin(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = asin(src[lane]) * 2.0f / M_PI;

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vnrcp: VD = -1 / VS per lane. */
void int_vnrcp(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = -1.0f / src[lane];

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vnsin: VD = -sin(VS * pi/2) per lane. */
void int_vnsin(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = -sin(M_PI_2 * src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/* vrexp2: VD = 1 / (2 ^ VS) per lane. */
void int_vrexp2(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float src[4], dst[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = 1.0f / pow(2.0f, src[lane]);

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/*
 * vf2in: convert each float lane of VS to a fixed-point integer with `imm`
 * fractional bits (value * 2^imm), rounding to nearest via round().
 *
 * The scale is built with an unsigned shift: `1 << 31` overflows a signed
 * int (undefined behaviour) and in practice produced a negative scale.
 */
void int_vf2in(u32 op)
{
    float s[4];
    s32 d[4];
    DataSize sz = GetVecSize(op);
    u8 imm = (op >> 16) & 0x1f;
    float mult = (float)(1u << imm);
    s32 n;
    u8 i;

    DEBUGME();
    ReadVector(s, sz, _VS);
    ApplySwizzleS(s, sz); /* TODO: and the mask to kill everything but swizzle */

    n = GetNumElements(sz);
    for (i = 0; i < n; i++)
        d[i] = (s32)round(s[i] * mult);

    /* The integer bit patterns travel through the float-typed vector helpers. */
    ApplyPrefixD((float*)d, sz); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)d, sz, _VD);
    EatPrefixes();
}

/*
 * vf2iz: convert each float lane of VS to a fixed-point integer with `imm`
 * fractional bits (value * 2^imm), rounding toward zero.
 *
 * The scale is built with an unsigned shift: `1 << 31` overflows a signed
 * int (undefined behaviour) and in practice produced a negative scale.
 */
void int_vf2iz(u32 op)
{
    float s[4];
    s32 d[4];
    DataSize sz = GetVecSize(op);
    u8 imm = (op >> 16) & 0x1f;
    float mult = (float)(1u << imm);
    s32 n;
    u8 i;

    DEBUGME();
    ReadVector(s, sz, _VS);
    ApplySwizzleS(s, sz); /* TODO: and the mask to kill everything but swizzle */

    n = GetNumElements(sz);
    for (i = 0; i < n; i++)
        d[i] = (s[i] >= 0) ? (s32)floor(s[i] * mult) : (s32)ceil(s[i] * mult);

    /* The integer bit patterns travel through the float-typed vector helpers. */
    ApplyPrefixD((float*)d, sz); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)d, sz, _VD);
    EatPrefixes();
}

/*
 * vf2iu: convert each float lane of VS to a fixed-point integer with `imm`
 * fractional bits (value * 2^imm), rounding up (ceil).
 *
 * The scale is built with an unsigned shift: `1 << 31` overflows a signed
 * int (undefined behaviour) and in practice produced a negative scale.
 */
void int_vf2iu(u32 op)
{
    float s[4];
    s32 d[4];
    DataSize sz = GetVecSize(op);
    u8 imm = (op >> 16) & 0x1f;
    float mult = (float)(1u << imm);
    s32 n;
    u8 i;

    DEBUGME();
    ReadVector(s, sz, _VS);
    ApplySwizzleS(s, sz); /* TODO: and the mask to kill everything but swizzle */

    n = GetNumElements(sz);
    for (i = 0; i < n; i++)
        d[i] = (s32)ceil(s[i] * mult);

    /* The integer bit patterns travel through the float-typed vector helpers. */
    ApplyPrefixD((float*)d, sz); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)d, sz, _VD);
    EatPrefixes();
}

/*
 * vf2id: convert each float lane of VS to a fixed-point integer with `imm`
 * fractional bits (value * 2^imm), rounding down (floor).
 *
 * The scale is built with an unsigned shift: `1 << 31` overflows a signed
 * int (undefined behaviour) and in practice produced a negative scale.
 */
void int_vf2id(u32 op)
{
    float s[4];
    s32 d[4];
    DataSize sz = GetVecSize(op);
    u8 imm = (op >> 16) & 0x1f;
    float mult = (float)(1u << imm);
    s32 n;
    u8 i;

    DEBUGME();
    ReadVector(s, sz, _VS);
    ApplySwizzleS(s, sz); /* TODO: and the mask to kill everything but swizzle */

    n = GetNumElements(sz);
    for (i = 0; i < n; i++)
        d[i] = (s32)floor(s[i] * mult);

    /* The integer bit patterns travel through the float-typed vector helpers. */
    ApplyPrefixD((float*)d, sz); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)d, sz, _VD);
    EatPrefixes();
}

/*
 * vi2f: convert each integer lane of VS (a fixed-point value with `imm`
 * fractional bits) to float, i.e. value / 2^imm.
 *
 * The scale is built with an unsigned shift: `1 << 31` overflows a signed
 * int (undefined behaviour) and in practice produced a negative scale.
 */
void int_vi2f(u32 op)
{
    s32 s[4];
    float d[4];
    u8 imm = (op >> 16) & 0x1f;
    float mult = 1.0f / (float)(1u << imm);
    DataSize sz = GetVecSize(op);
    s32 n;
    u8 i;

    DEBUGME();
    /* The integer bit patterns travel through the float-typed vector helpers. */
    ReadVector((float*)s, sz, _VS);
    ApplySwizzleS((float*)s, sz); /* TODO: and the mask to kill everything but swizzle */

    n = GetNumElements(sz);
    for (i = 0; i < n; i++)
        d[i] = (float)s[i] * mult;

    ApplyPrefixD(d, sz); /* TODO: and the mask to kill everything but mask */
    WriteVector(d, sz, _VD);
    EatPrefixes();
}

/*
 * vi2uc: pack the four signed integer lanes of VS into four unsigned bytes
 * of a single register (lane i -> byte i).
 *
 * Negative lanes become 0. Non-negative lanes span 31 bits, so the top eight
 * of those are bits 30..23; the previous `>> 24` could only ever produce
 * 0..127. The vector sizes are spelled with the enum names instead of bare
 * numbers.
 */
void int_vi2uc(u32 op)
{
    s32 s[4];
    u32 v = 0;
    u8 i;

    DEBUGME();
    ReadVector((float*)s, V_Quad, _VS);
    ApplySwizzleS((float*)s, V_Quad); /* TODO: and the mask to kill everything but swizzle */

    for (i = 0; i < 4; i++)
    {
        if (s[i] > 0)
            v |= (u32)(s[i] >> 23) << (i * 8);
    }

    ApplyPrefixD((float*)&v, V_Single); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)&v, V_Single, _VD);
    EatPrefixes();
}

/*
 * vi2c: pack the top byte of each of the four integer lanes of VS into a
 * single register (lane i -> byte i).
 *
 * The shift is done on the unsigned bit pattern: an arithmetic shift of a
 * negative lane sign-extends, and OR-ing that in overwrote the bytes of all
 * higher lanes with 0xff. The vector sizes are spelled with the enum names
 * instead of bare numbers.
 */
void int_vi2c(u32 op)
{
    s32 s[4];
    u32 v = 0;
    u8 i;

    DEBUGME();
    ReadVector((float*)s, V_Quad, _VS);
    ApplySwizzleS((float*)s, V_Quad); /* TODO: and the mask to kill everything but swizzle */

    for (i = 0; i < 4; i++)
        v |= ((u32)s[i] >> 24) << (i * 8);

    ApplyPrefixD((float*)&v, V_Single); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)&v, V_Single, _VD);
    EatPrefixes();
}

/*
 * vi2us: pack pairs of signed integer lanes of VS into unsigned 16-bit
 * halves: lanes 2k and 2k+1 become the low and high half of output word k.
 *
 * Fixes over the previous version:
 *  - the output index was `sz / 2` for every lane, which is out of bounds
 *    for a quad and merged all lanes into one word;
 *  - the shift `i * 16` reached 32 and 48 for lanes 2 and 3;
 *  - non-negative lanes span 31 bits, so their top 16 bits are 30..15
 *    (`>> 15`), not 31..16.
 * Negative lanes become 0.
 */
void int_vi2us(u32 op)
{
    s32 s[4];
    u32 v[2] = {0};
    DataSize sz = GetVecSize(op);
    s32 n;
    u8 i;

    DEBUGME();
    ReadVector((float*)s, sz, _VS);
    ApplySwizzleS((float*)s, sz); /* TODO: and the mask to kill everything but swizzle */

    n = GetNumElements(sz);
    for (i = 0; i < n && i < 4; i++)
    {
        if (s[i] > 0)
            v[i / 2] |= (u32)(s[i] >> 15) << ((i & 1) * 16);
    }

    /* As before, the output size is derived as sz / 2 (quad -> pair, pair -> single). */
    ApplyPrefixD((float*)v, sz / 2); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)v, sz / 2, _VD);
    EatPrefixes();
}

/*
 * vi2s: pack the top 16 bits of pairs of integer lanes of VS: lanes 2k and
 * 2k+1 become the low and high half of output word k.
 *
 * Fixes over the previous version:
 *  - the output index was `sz / 2` for every lane, which is out of bounds
 *    for a quad and merged all lanes into one word;
 *  - the shift `i * 16` reached 32 and 48 for lanes 2 and 3;
 *  - the arithmetic shift of a negative lane sign-extended into the
 *    neighbouring half; the shift is now done on the unsigned bit pattern.
 */
void int_vi2s(u32 op)
{
    s32 s[4];
    u32 v[2] = {0};
    DataSize sz = GetVecSize(op);
    s32 n;
    u8 i;

    DEBUGME();
    ReadVector((float*)s, sz, _VS);
    ApplySwizzleS((float*)s, sz); /* TODO: and the mask to kill everything but swizzle */

    n = GetNumElements(sz);
    for (i = 0; i < n && i < 4; i++)
        v[i / 2] |= ((u32)s[i] >> 16) << ((i & 1) * 16);

    /* As before, the output size is derived as sz / 2 (quad -> pair, pair -> single). */
    ApplyPrefixD((float*)v, sz / 2); /* TODO: and the mask to kill everything but mask */
    WriteVector((float*)v, sz / 2, _VD);
    EatPrefixes();
}

/* vdot: dot product of VS and VT (after their prefixes), stored as a single value in register VD. */
void int_vdot(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float lhs[4], rhs[4];
    float sum = 0;
    u8 count, lane;

    DEBUGME();

    ReadVector(lhs, vecSize, _VS);
    ApplySwizzleS(lhs, vecSize);
    ReadVector(rhs, vecSize, _VT);
    ApplySwizzleT(rhs, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        sum += lhs[lane] * rhs[lane];

    /* Only one value is produced, so the D prefix is applied as for a single. */
    ApplyPrefixD(&sum, V_Single);
    V(_VD) = sum;
    EatPrefixes();
}

/* vbfy1: butterfly on adjacent lanes -- each pair (a, b) of VS becomes (a + b, a - b) in VD. */
void int_vbfy1(u32 op)
{
    float src[4], dst[4];
    DataSize vecSize = GetVecSize(op);
    u8 count = GetNumElements(vecSize);
    u8 base;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    for (base = 0; base < count; base += 2)
    {
        float a = src[base];
        float b = src[base + 1];

        dst[base]     = a + b;
        dst[base + 1] = a - b;
    }

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/*
 * vbfy2: butterfly across the two halves of a four-lane vector --
 * (x, y, z, w) becomes (x + z, y + w, x - z, y - w).
 * All four lanes are used regardless of the decoded vector size.
 */
void int_vbfy2(u32 op)
{
    float src[4], dst[4];
    DataSize vecSize = GetVecSize(op);
    u8 k;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    for (k = 0; k < 2; ++k)
    {
        dst[k]     = src[k] + src[k + 2];
        dst[k + 2] = src[k] - src[k + 2];
    }

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/*
 * vcrs: the first products of a cross product --
 * VD = (s.y * t.z, s.z * t.x, s.x * t.y). No S/T prefixes are applied.
 */
void int_vcrs(u32 op)
{
    float lhs[4], rhs[4], out[4];
    DataSize vecSize = GetVecSize(op);

    DEBUGME();
    ReadVector(lhs, vecSize, _VS);
    ReadVector(rhs, vecSize, _VT);

    out[2] = lhs[0] * rhs[1];
    out[1] = lhs[2] * rhs[0];
    out[0] = lhs[1] * rhs[2];

    ApplyPrefixD(out, vecSize);
    WriteVector(out, vecSize, _VD);
    EatPrefixes();
}

/* vfad: sum all lanes of VS (after the S prefix) and store the total in register VD. */
void int_vfad(u32 op)
{
    float src[4];
    float total = 0;
    DataSize vecSize = GetVecSize(op);
    u8 count = GetNumElements(vecSize);
    u8 lane;

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    for (lane = 0; lane < count; ++lane)
        total += src[lane];

    /* Only one value is produced, so the D prefix is applied as for a single. */
    ApplyPrefixD(&total, V_Single);
    V(_VD) = total;
    EatPrefixes();
}

/* vscl: multiply every lane of VS by the scalar held in register VT and write the result to VD. */
void int_vscl(u32 op)
{
    float src[4], dst[4];
    float factor = V(_VT);
    u8 count, lane;
    DataSize vecSize = GetVecSize(op);

    DEBUGME();
    ReadVector(src, vecSize, _VS);
    ApplySwizzleS(src, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        dst[lane] = src[lane] * factor;

    ApplyPrefixD(dst, vecSize);
    WriteVector(dst, vecSize, _VD);
    EatPrefixes();
}

/*
 * vrot: build a vector from sin/cos of register VS (in quarter-turn units).
 *
 * The 5-bit immediate selects the layout:
 *   bits 0-1 : lane that receives the cosine
 *   bits 2-3 : lane that receives the sine
 *   bit  4   : negate the sine
 * If both selectors name the same lane, every lane gets the sine and the
 * selected lane is then overwritten with the cosine; otherwise all other
 * lanes are zero.
 */
void int_vrot(u32 op)
{
    u32 imm = (op >> 16) & 0x1f;
    DataSize vecSize = GetVecSize(op);
    u8 count = GetNumElements(vecSize);
    float angle = V(_VS) * M_PI_2;
    float sinVal = sin(angle);
    float cosVal = cos(angle);
    u8 sinLane = (imm >> 2) & 3;
    u8 cosLane = imm & 3;
    float out[4];
    float fill;
    u8 lane;
    DEBUGME();

    if (imm & 0x10)
        sinVal = -sinVal;

    fill = (sinLane == cosLane) ? sinVal : 0.0f;
    for (lane = 0; lane < count; ++lane)
        out[lane] = fill;

    if (sinLane != cosLane)
        out[sinLane] = sinVal;
    out[cosLane] = cosVal;

    WriteVector(out, vecSize, _VD);
    EatPrefixes();
}

/*
 * vhtfm2: homogeneous 2x2 transform -- the single input from VT is treated
 * as (t, 1) and multiplied by each 2-element line of the matrix in VS.
 */
void int_vhtfm2(u32 op)
{
    float mtx[4], vec[1], out[2];
    u8 line;

    DEBUGME();
    ReadVector(vec, V_Single, _VT);
    ReadMatrix(mtx, V_2x2, 2, _VS);

    for (line = 0; line < 2; ++line)
        out[line] = mtx[line * 2] * vec[0] + mtx[line * 2 + 1];

    WriteVector(out, V_Pair, _VD);
    EatPrefixes();
}

/* vtfm2: multiply each 2-element line of the matrix in VS with the pair in VT and write the pair to VD. */
void int_vtfm2(u32 op)
{
    float mtx[4], vec[2], out[2];
    u8 line;

    DEBUGME();
    ReadVector(vec, V_Pair, _VT);
    ReadMatrix(mtx, V_2x2, 2, _VS);

    for (line = 0; line < 2; ++line)
        out[line] = mtx[line * 2] * vec[0] + mtx[line * 2 + 1] * vec[1];

    WriteVector(out, V_Pair, _VD);
    EatPrefixes();
}

/*
 * vhtfm3: homogeneous 3x3 transform -- the pair from VT is treated as
 * (t0, t1, 1) and multiplied by each 3-element line of the matrix in VS.
 */
void int_vhtfm3(u32 op)
{
    float mtx[9], vec[2], out[3];
    u8 line;

    DEBUGME();
    ReadVector(vec, V_Pair, _VT);
    ReadMatrix(mtx, V_3x3, 3, _VS);

    for (line = 0; line < 3; ++line)
    {
        const float *m = &mtx[line * 3];

        out[line] = m[0] * vec[0] + m[1] * vec[1] + m[2];
    }

    WriteVector(out, V_Triple, _VD);
    EatPrefixes();
}

/* vtfm3: multiply each 3-element line of the matrix in VS with the triple in VT and write the triple to VD. */
void int_vtfm3(u32 op)
{
    float mtx[9], vec[3], out[3];
    u8 line;

    DEBUGME();
    ReadVector(vec, V_Triple, _VT);
    ReadMatrix(mtx, V_3x3, 3, _VS);

    for (line = 0; line < 3; ++line)
    {
        const float *m = &mtx[line * 3];

        out[line] = m[0] * vec[0] + m[1] * vec[1] + m[2] * vec[2];
    }

    WriteVector(out, V_Triple, _VD);
    EatPrefixes();
}

/*
 * vhtfm4: homogeneous 4x4 transform -- the triple from VT is treated as
 * (t0, t1, t2, 1) and multiplied by each 4-element line of the matrix in VS.
 */
void int_vhtfm4(u32 op)
{
    float mtx[16], vec[3], out[4];
    u8 line;

    DEBUGME();
    ReadVector(vec, V_Triple, _VT);
    ReadMatrix(mtx, V_4x4, 4, _VS);

    for (line = 0; line < 4; ++line)
    {
        const float *m = &mtx[line * 4];

        out[line] = m[0] * vec[0] + m[1] * vec[1] + m[2] * vec[2] + m[3];
    }

    WriteVector(out, V_Quad, _VD);
    EatPrefixes();
}

/*
 * vtfm4: multiply each 4-element line of the matrix in VS with the quad in
 * VT and write the resulting quad to VD.
 *
 * The result is written as V_Quad; it used to be written as V_Triple, which
 * silently dropped the fourth component computed below.
 */
void int_vtfm4(u32 op)
{
    float s[16], t[4], d[4];

    DEBUGME();
    ReadVector(t, V_Quad, _VT);
    ReadMatrix(s, V_4x4, 4, _VS);
    d[0] = s[ 0] * t[0] + s[ 1] * t[1] + s[ 2] * t[2] + s[ 3] * t[3];
    d[1] = s[ 4] * t[0] + s[ 5] * t[1] + s[ 6] * t[2] + s[ 7] * t[3];
    d[2] = s[ 8] * t[0] + s[ 9] * t[1] + s[10] * t[2] + s[11] * t[3];
    d[3] = s[12] * t[0] + s[13] * t[1] + s[14] * t[2] + s[15] * t[3];
    WriteVector(d, V_Quad, _VD);
    EatPrefixes();
}

/* mfv: copy the raw 32-bit pattern of VFPU register VR into general-purpose register RT. */
void int_mfv(u32 op)
{
    u32 bits;

    DEBUGME();
    bits = VI(_VR);
    R(_RT) = bits;
}

/* mtv: copy general-purpose register RT, bit for bit, into VFPU register VR. */
void int_mtv(u32 op)
{
    u32 bits;

    DEBUGME();
    bits = R(_RT);
    VI(_VR) = bits;
}

/*
 * vcst: broadcast one of the built-in constants to every lane of VD.
 * The constant is selected by the 5-bit field at op bits 16-20; entries past
 * the end of the initializer list are zero.
 *
 * The result is a vector (the size comes from GetVecSize), so it is written
 * with WriteVector. It used to go through WriteMatrix with a vector size and
 * a 4-float buffer.
 */
void int_vcst(u32 op)
{
    float constants[32] = 
    {
        0,
        FLT_MAX,
        sqrt(2.0f),
        sqrt(0.5f),
        2.0f / sqrt((float)M_PI),
        2.0f / (float)M_PI,
        1.0f / (float)M_PI,
        (float)M_PI_2 / 2,          /* pi / 4 */
        (float)M_PI_2,
        (float)M_PI,
        (float)M_E,
        (float)M_LOG2E,
        (float)M_LOG10E,
        (float)M_LN2,
        (float)M_LN10,
        2 * (float)M_PI,
        (float)M_PI_2 / 3,          /* pi / 6 */
        log10(2.0f),
        log2(10.0f),
        sqrt(3.0f) / 2.0f,
    };

    u8 id = (op >> 16) & 0x1f;
    DataSize sz = GetVecSize(op);
    u8 n = GetNumElements(sz);
    u8 i;
    float c = constants[id];
    float vec[4];

    DEBUGME();
    for (i = 0; i < n; i++)
        vec[i] = c;
    WriteVector(vec, sz, _VD);
    EatPrefixes();
}

enum VCondition 
{
    VC_FL,
    VC_EQ,
    VC_LT,
    VC_LE,
    VC_TR,
    VC_NE,
    VC_GE,
    VC_GT,
    VC_EZ,
    VC_EN,
    VC_EI,
    VC_ES,
    VC_NZ,
    VC_NN,
    VC_NI,
    VC_NS
};

/*
 * vcmp: compare VS and VT lane by lane and store the outcome in the VFPU
 * condition-code register:
 *   bits 0..n-1 : per-lane result
 *   bit 4       : OR of all lane results
 *   bit 5       : AND of all lane results
 *
 * All sixteen condition codes are handled. VC_EQ and the NaN / infinity
 * tests used to fall through to the "unsupported" default and always
 * produced 0.
 */
void int_vcmp(u32 op)
{
    u8 cond = op & 15;
    DataSize sz = GetVecSize(op);
    u8 n = GetNumElements(sz);
    float s[4];
    float t[4];
    u8 i;
    u8 cc = 0;
    u8 ore = 0;
    u8 ande = 1;

    DEBUGME();
    ReadVector(s, sz, _VS);
    ApplySwizzleS(s, sz);
    ReadVector(t, sz, _VT);
    ApplySwizzleT(t, sz);
    for (i = 0; i < n; i++)
    {
        u8 c;

        switch (cond)
        {
        case VC_FL:
            c = 0;
            break;

        case VC_EQ:
            c = s[i] == t[i];
            break;

        case VC_LT:
            c = s[i] < t[i];
            break;

        case VC_LE:
            c = s[i] <= t[i];
            break;

        case VC_TR:
            c = 1;
            break;

        case VC_NE:
            c = s[i] != t[i];
            break;

        case VC_GE:
            c = s[i] >= t[i];
            break;

        case VC_GT:
            c = s[i] > t[i];
            break;

        case VC_EZ:
            /* -0.0f compares equal to 0.0f, so one comparison covers both. */
            c = s[i] == 0.0f;
            break;

        case VC_EN:
            c = isnan(s[i]) ? 1 : 0;
            break;

        case VC_EI:
            c = isinf(s[i]) ? 1 : 0;
            break;

        case VC_ES:
            c = (isnan(s[i]) || isinf(s[i])) ? 1 : 0;
            break;

        case VC_NZ:
            c = s[i] != 0;
            break;

        case VC_NN:
            c = isnan(s[i]) ? 0 : 1;
            break;

        case VC_NI:
            c = isinf(s[i]) ? 0 : 1;
            break;

        case VC_NS:
            c = (isnan(s[i]) || isinf(s[i])) ? 0 : 1;
            break;

        default:
            c = 0;
            _log(ERR, VFPU, "Unsupported vcmp condition code %d", cond);
            break;
        }

        cc |= (c << i);
        ore |= c;
        ande &= c;
    }
    cpu.vfpuCtrl[VFPU_CTRL_CC] = cc | (ore << 4) | (ande << 5);
    EatPrefixes();
}


/*
 * vcmov: conditionally move VS into VD based on the VFPU condition codes.
 *
 *   imm3 (op bits 16-18) : 0..5 test that single CC bit for the whole
 *                          vector; 6 tests CC bit i for lane i.
 *   tf   (op bit 19)     : 0 = move when the tested bit is set,
 *                          1 = move when it is clear.
 *
 * `tf` used to be ignored for imm3 <= 6, so the move-if-false form behaved
 * exactly like move-if-true. The imm3 == 7 behaviour (move everything when
 * tf is set) is kept as it was.
 */
void int_vcmov(u32 op)
{
    u8 imm3 = (op >> 16) & 7;
    u8 tf = (op >> 19) & 1;
    u8 wanted = tf ? 0 : 1; /* CC bit value that triggers the move */
    DataSize sz = GetVecSize(op);
    u8 n = GetNumElements(sz);
    float s[4], t[4], d[4];
    u8 i;
    u32 cc = cpu.vfpuCtrl[VFPU_CTRL_CC];

    DEBUGME();

    ReadVector(s, sz, _VS);
    ApplySwizzleS(s, sz);
    /* The current destination is read as the T operand: it is the value
     * that survives when the move does not happen. */
    ReadVector(t, sz, _VD);
    ApplySwizzleT(t, sz);

    for (i = 0; i < n; i++)
        d[i] = t[i];

    if (imm3 < 6) {
        if (((cc >> imm3) & 1) == wanted)
            for (i = 0; i < n; i++)
                d[i] = s[i];
    }
    else if (imm3 == 6) {
        for (i = 0; i < n; i++)
            if (((cc >> i) & 1) == wanted)
                d[i] = s[i];
    }
    else if (tf)
        for (i = 0; i < n; i++)
            d[i] = s[i];

    ApplyPrefixD(d, sz);
    WriteVector(d, sz, _VD);
    EatPrefixes();
}

/* vadd: VD = VS + VT per lane, with S/T prefixes on the inputs and the D prefix on the output. */
void int_vadd(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float lhs[4], rhs[4], out[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(lhs, vecSize, _VS);
    ApplySwizzleS(lhs, vecSize);
    ReadVector(rhs, vecSize, _VT);
    ApplySwizzleT(rhs, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        out[lane] = lhs[lane] + rhs[lane];

    ApplyPrefixD(out, vecSize);
    WriteVector(out, vecSize, _VD);
    EatPrefixes();
}

/* vsub: VD = VS - VT per lane, with S/T prefixes on the inputs and the D prefix on the output. */
void int_vsub(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float lhs[4], rhs[4], out[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(lhs, vecSize, _VS);
    ApplySwizzleS(lhs, vecSize);
    ReadVector(rhs, vecSize, _VT);
    ApplySwizzleT(rhs, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        out[lane] = lhs[lane] - rhs[lane];

    ApplyPrefixD(out, vecSize);
    WriteVector(out, vecSize, _VD);
    EatPrefixes();
}

/* vdiv: VD = VS / VT per lane, with S/T prefixes on the inputs and the D prefix on the output. */
void int_vdiv(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float lhs[4], rhs[4], out[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(lhs, vecSize, _VS);
    ApplySwizzleS(lhs, vecSize);
    ReadVector(rhs, vecSize, _VT);
    ApplySwizzleT(rhs, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        out[lane] = lhs[lane] / rhs[lane];

    ApplyPrefixD(out, vecSize);
    WriteVector(out, vecSize, _VD);
    EatPrefixes();
}

/* vmul: VD = VS * VT per lane, with S/T prefixes on the inputs and the D prefix on the output. */
void int_vmul(u32 op)
{
    DataSize vecSize = GetVecSize(op);
    float lhs[4], rhs[4], out[4];
    s32 count;
    u8 lane;

    DEBUGME();
    ReadVector(lhs, vecSize, _VS);
    ApplySwizzleS(lhs, vecSize);
    ReadVector(rhs, vecSize, _VT);
    ApplySwizzleT(rhs, vecSize);

    count = GetNumElements(vecSize);
    for (lane = 0; lane < count; ++lane)
        out[lane] = lhs[lane] * rhs[lane];

    ApplyPrefixD(out, vecSize);
    WriteVector(out, vecSize, _VD);
    EatPrefixes();
}

/* vcrsp: three-component cross product, VD = VS x VT. No prefixes are applied. */
void int_vcrsp(u32 op)
{
    float a[3], b[3], cross[3];

    DEBUGME();
    ReadVector(a, 3, _VS);
    ReadVector(b, 3, _VT);

    cross[0] = a[1] * b[2] - a[2] * b[1];
    cross[1] = a[2] * b[0] - a[0] * b[2];
    cross[2] = a[0] * b[1] - a[1] * b[0];

    WriteVector(cross, 3, _VD);
    EatPrefixes();
}

/*
 * vqmul: quaternion product of VS and VT, written to VD. Lanes 0..2 hold the
 * vector part and lane 3 the scalar part, as the sign pattern below shows.
 * No prefixes are applied.
 */
void int_vqmul(u32 op)
{
    float a[4], b[4], q[4];

    DEBUGME();
    ReadVector(a, 4, _VS);
    ReadVector(b, 4, _VT);

    q[0] = +a[0] * b[3] + a[1] * b[2] - a[2] * b[1] + a[3] * b[0];
    q[1] = -a[0] * b[2] + a[1] * b[3] + a[2] * b[0] + a[3] * b[1];
    q[2] = +a[0] * b[1] - a[1] * b[0] + a[2] * b[3] + a[3] * b[2];
    q[3] = -a[0] * b[0] - a[1] * b[1] - a[2] * b[2] + a[3] * b[3];

    WriteVector(q, 4, _VD);
    EatPrefixes();
}

/*
 * bvf: branch if the selected VFPU condition-code bit is clear.
 *
 * The macro is wrapped in explicit parentheses: VFPUBRANCHVAL expands to an
 * expression ending in `& 1`, so a bare `!VFPUBRANCHVAL` negated the shifted
 * register first and only then masked it, testing the wrong thing whenever
 * a higher CC bit was set.
 */
void int_bvf(u32 op)
{
    DEBUGME();
    if (!(VFPUBRANCHVAL))
        cpu_delayBranchTo(BRANCHADDR);
}

/* bvt: branch if the selected VFPU condition-code bit is set. */
void int_bvt(u32 op)
{
    u32 ccBit;

    DEBUGME();
    ccBit = VFPUBRANCHVAL;
    if (ccBit != 0)
        cpu_delayBranchTo(BRANCHADDR);
}

/*
 * bvfl: "likely" branch if the selected VFPU condition-code bit is clear;
 * when the branch is not taken, PC is advanced by 4 to skip the delay slot.
 *
 * The macro is wrapped in explicit parentheses: VFPUBRANCHVAL expands to an
 * expression ending in `& 1`, so a bare `!VFPUBRANCHVAL` negated the shifted
 * register first and only then masked it, testing the wrong thing whenever
 * a higher CC bit was set.
 */
void int_bvfl(u32 op)
{
    DEBUGME();
    if (!(VFPUBRANCHVAL))
        cpu_delayBranchTo(BRANCHADDR);
    else
        PC += 4;
}

/*
 * bvtl: "likely" branch if the selected VFPU condition-code bit is set;
 * when the branch is not taken, PC is advanced by 4 to skip the delay slot.
 */
void int_bvtl(u32 op)
{
    u32 ccBit;

    DEBUGME();
    ccBit = VFPUBRANCHVAL;
    if (ccBit == 0)
    {
        PC += 4;
        return;
    }

    cpu_delayBranchTo(BRANCHADDR);
}

